> project_summary = "/Users/rory/cache/fabio-splicing/new-samples/2016-07-29_fabio-newlines/project-summary.csv"
> counts_file = "/Users/rory/cache/fabio-splicing/new-samples/2016-07-29_fabio-newlines/combined.counts"
> tx2genes_file = "/Users/rory/cache/fabio-splicing/new-samples/2016-07-29_fabio-newlines/tx2gene.csv"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
+ "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> summarydata = summarydata[, colSums(is.na(summarydata)) < nrow(summarydata)]
> # handle newer bcbio-nextgen runs that use description as the key
> if ("description" %in% colnames(summarydata)) {
+ rownames(summarydata) = summarydata$description
+ summarydata$Name = rownames(summarydata)
+ summarydata$description = NULL
+ } else {
+ rownames(summarydata) = summarydata$Name
+ # summarydata$Name = NULL
+ }
> summarydata = summarydata[order(rownames(summarydata)), ]
> if (file.exists(tx2genes_file)) {
+ sample_dirs = file.path(dirname(project_summary), "..", rownames(summarydata))
+ salmon_files = file.path(sample_dirs, "salmon", "quant.sf")
+ sailfish_files = file.path(sample_dirs, "sailfish", "quant.sf")
+ new_sailfish = file.path(sample_dirs, "sailfish", "quant", "quant.sf")
+ new_salmon = file.path(sample_dirs, "salmon", "quant", "quant.sf")
+ if (file.exists(salmon_files[1])) {
+ sf_files = salmon_files
+ } else if (file.exists(sailfish_files[1])) {
+ sf_files = sailfish_files
+ } else if (file.exists(new_sailfish[1])) {
+ sf_files = new_sailfish
+ } else if (file.exists(new_salmon[1])) {
+ sf_files = new_salmon
+ }
+ names(sf_files) = rownames(summarydata)
+ tx2gene = read.table(tx2genes_file, sep = ",", row.names = NULL, header = FALSE)
+ txi.salmon = tximport(sf_files, type = "salmon", tx2gene = tx2gene, reader = readr::read_tsv,
+ countsFromAbundance = "lengthScaledTPM")
+ counts = round(data.frame(txi.salmon$counts, check.names = FALSE))
+ } else {
+ counts = read.table(counts_file, header = TRUE, row.names = "id", check.names = FALSE)
+ }
> counts = counts[, order(colnames(counts)), drop = FALSE]
> colnames(counts) = gsub(".counts", "", colnames(counts))
>
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality",
+ "rRNA_rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate",
+ "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped",
+ "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.",
+ "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity",
+ "X5.3.bias", "Duplicates.pct", "Duplicates", "Mapped.reads", "Average.insert.size",
+ "Mapped.reads.pct", "Total.reads", "avg_coverage_per_region", "Mapped.Reads")
> summarydata[, "Fragment.Length.Mean"] = summarydata$Average.insert.size
> metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
> metadata = metadata[, colSums(is.na(metadata)) < nrow(metadata), drop = FALSE]
> metadata$treatment = relevel(metadata$treatment, ref = "Ctrl")
> metadata$bort = relevel(metadata$bort, ref = "N")
> metadata$e7107 = relevel(metadata$e7107, ref = "N")
> mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
> t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id",
+ "external_gene_name", "gene_biotype", "transcript_biotype"), mart = mart)
> t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id,
+ ext_gene = external_gene_name)
> symbols = unique(t2g[, c("ens_gene", "ext_gene", "gene_biotype")])
> sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-",
+ "_", colnames(df)))
+ }
> # set seed for reproducibility
> set.seed(1454944673)
> get_heatmap_fn = function(summarydata) {
+ # return the pheatmap function with or without metadata
+ if (ncol(metadata) == 0) {
+ return(pheatmap)
+ } else {
+ # rownames(metadata) = summarydata$Name
+ heatmap_fn = function(data, ...) {
+ pheatmap(data, annotation = metadata, clustering_method = "ward.D2",
+ clustering_distance_cols = "correlation", ...)
+ }
+ return(heatmap_fn)
+ }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)
> qualimap_run = "Mapped" %in% colnames(summarydata)
> do_quality = "Mapped.reads" %in% colnames(summarydata)
Overall mapped reads looks good.
> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
> ggplot(summarydata, aes(x = Name, y = Mapped.reads)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
Genomic mapping rate looks great.
> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
> ggplot(summarydata, aes(x = Name, y = Mapped.reads.pct)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
Good number of genes detected.
> dd = data.frame(Name = colnames(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") +
+ xlab("")
This plot looks a little off, but it is the way the axis are. It is fine.
> col_mapped = ifelse(qualimap_run, "Mapped", "Mapped.reads")
> dd = data.frame(Mapped = summarydata[, col_mapped], Genes.Detected = colSums(counts >
+ 0))
> ggplot(dd, aes(x = Mapped, y = Genes.Detected)) + geom_point() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ ylab("genes detected") + xlab("reads mapped")
Here we are starting to see some issues that might be batch effects. Ctrl and Bort1 samples have slightly higher exonic mapping rate.
> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") +
+ xlab("")
They also have a lower rRNA rate:
> eval_rRNA = "rRNA_rate" %in% colnames(summarydata) & !sum(is.na(summarydata$rRNA_rate)) ==
+ nrow(summarydata)
> ggplot(summarydata, aes(x = Name, y = rRNA_rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") +
+ xlab("")
and a higher estimated fragment size
> ggplot(summarydata, aes(x = Name, y = Fragment.Length.Mean)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("fragment length") +
+ xlab("")
> ggplot(summarydata, aes(x = Name, y = X5.3.bias)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("5'->3' bias") +
+ xlab("")
> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Trimmed mean of M-values (TMM) normalization is described here
Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
> ggplot(melted, aes(x = count, group = sample)) + geom_density() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
The control and bort samples are more similar to each other than the other samples. We won’t be able to tell if that is due to real differences or some of the batch differences we saw above.
> heatmap_fn(cor(normalized_counts, method = "pearson"))
> heatmap_fn(cor(normalized_counts, method = "spearman"))
We can see the same effects on the PCA plot. The different treatments separate out the cells, bort + control are more similar to each other and E7107 + BE are more similar to each other. It looks like the E7107 treatment is the strongest effect as most of the variance is explained by whether or not the samples have been treated with E7107.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> vst = varianceStabilizingTransformation(dds)
> pca_loadings = function(object, ntop = 500) {
+ rv <- matrixStats::rowVars(assay(object))
+ select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+ pca <- prcomp(t(assay(object)[select, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_loadings(vst)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "treatment"
> pca_plot = function(comps, nc1, nc2, colorby) {
+ c1str = paste0("PC", nc1)
+ c2str = paste0("PC", nc2)
+ ggplot(comps, aes_string(c1str, c2str, color = colorby)) + geom_point() +
+ theme_bw() + xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100),
+ "% variance")) + ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] *
+ 100), "% variance"))
+ }
> pca_plot(comps, 1, 2, colorby)
> pca_plot(comps, 3, 4, colorby)
> pca_plot(comps, 5, 6, colorby)
> ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar),
+ percent_var = pc$percentVar), aes(component, percent_var)) + geom_bar(stat = "identity") +
+ ylab("percent of total variation") + xlab("") + theme_bw()
> # snagged from development version of DESeq
> DESeqDataSetFromTximport <- function(txi, colData, design, ...) {
+ counts <- round(txi$counts)
+ mode(counts) <- "integer"
+ dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = design,
+ ...)
+ stopifnot(txi$countsFromAbundance %in% c("no", "scaledTPM", "lengthScaledTPM"))
+ if (txi$countsFromAbundance %in% c("scaledTPM", "lengthScaledTPM")) {
+ message("using length scaled TPM counts from tximport")
+ } else {
+ message("using counts and average transcript lengths from tximport")
+ lengths <- txi$length
+ dimnames(lengths) <- dimnames(dds)
+ assays(dds)[["avgTxLength"]] <- lengths
+ }
+ return(dds)
+ }
>
> subset_tximport = function(txi, rows, columns) {
+ txi$counts = txi$counts[rows, columns]
+ txi$abundance = txi$abundance[rows, columns]
+ txi$length = txi$length[rows, columns]
+ return(txi)
+ }
> library(DEGreport)
> library(vsn)
> design = ~treatment
> condition = "treatment"
> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+ txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+ dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+ dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row !=
+ 0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
> par(mfrow = c(1, 3))
> notAllZero <- (rowSums(counts(dds)) > 0)
> rld <- rlog(dds)
> vsd <- varianceStabilizingTransformation(dds)
> rlogMat <- assay(rld)
> vstMat <- assay(vsd)
>
> meanSdPlot(log2(counts(dds, normalized = TRUE)[notAllZero, ] + 1))
> meanSdPlot(assay(rld[notAllZero, ]))
> meanSdPlot(assay(vsd[notAllZero, ]))
The dispersion estimates are super low, I guess because it is cell lines? Just to be clear, these are biological replicates right? Different replicates of the same treatment were done on different days to different batches?
replicates;
> plotDispEsts(dds)
> e7107 = results(dds, contrast = c("treatment", "E7107", "Ctrl"))
> bort = results(dds, contrast = c("treatment", "Bort", "Ctrl"))
> be = results(dds, contrast = c("treatment", "BE", "Ctrl"))
> plotMA_full = function(res) {
+ ymax = max(res$log2FoldChange, na.rm = TRUE)
+ ymin = min(res$log2FoldChange, na.rm = TRUE)
+ plotMA(res, ylim = c(ymin, ymax))
+ }
> volcano = function(res) {
+ stats = as.data.frame(res[, c(2, 6)])
+ p = volcano_density_plot(stats, lfc.cutoff = 1.5)
+ print(p)
+ }
> write_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ens_gene")) %>% arrange(padj)
+ write.table(out_df, file = fname, quote = FALSE, sep = ",", row.names = FALSE,
+ col.names = TRUE)
+ }
> plotMA_full(bort)
> volcano(bort)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.178]
> write_deseq(bort, "control-vs-bort-deseq2.csv")
> plotMA_full(e7107)
> volcano(e7107)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.346]
> write_deseq(e7107, "control-vs-e7107-deseq2.csv")
> plotMA_full(be)
> volcano(be)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.514]
> write_deseq(be, "control-vs-be-deseq2.csv")
> library(dplyr)
> library(sleuth)
> sf_dirs = file.path("..", "..", rownames(summarydata), "sailfish", "quant")
> names(sf_dirs) = rownames(summarydata)
> sfdata = metadata
> sfdata$sample = rownames(sfdata)
> sfdata$path = sf_dirs
> mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
> t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id",
+ "external_gene_name", "gene_biotype", "transcript_biotype"), mart = mart)
> t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id,
+ ext_gene = external_gene_name)
> library(biomaRt)
> so = sleuth_prep(sfdata, design, target_mapping = t2g) %>% sleuth_fit()
> models(so)
> save(so, file = "sleuth_models.RData")
> rm(so)
> load("sleuth_models.RData")
> so = sleuth_wt(so, "treatmentBE")
> so = sleuth_wt(so, "treatmentBort")
> so = sleuth_wt(so, "treatmentE7107")
> bort_tab = sleuth_results(so, "treatmentBort", show_all = TRUE)
> e7107_tab = sleuth_results(so, "treatmentE7107", show_all = TRUE)
> be_tab = sleuth_results(so, "treatmentBE", show_all = TRUE)
> reciprocal_sleuth = function(tab) {
+ tab %>% na.omit() %>% group_by(ens_gene) %>% mutate(reciprocal = (sum(b >
+ 0 & qval < 0.05) > 0) & sum(b < 0 & qval < 0.05) > 0 & length(ens_gene) >
+ 1) %>% as.data.frame()
+ }
> bort_tab = reciprocal_sleuth(bort_tab)
> e7107_tab = reciprocal_sleuth(e7107_tab)
> be_tab = reciprocal_sleuth(be_tab)
> write.table(bort_tab, "control-vs-bort-sleuth.csv", quote = FALSE, row.names = FALSE,
+ col.names = TRUE, sep = ",")
> write.table(e7107_tab, "control-vs-e7107-sleuth.csv", quote = FALSE, row.names = FALSE,
+ col.names = TRUE, sep = ",")
> write.table(be_tab, "control-vs-be-sleuth.csv", quote = FALSE, row.names = FALSE,
+ col.names = TRUE, sep = ",")
> library(cowplot)
> sleuth_MA = function(results) {
+ results$DE = results$qval < 0.05
+ ggplot(results, aes(mean_obs, b, color = DE)) + geom_point(size = 0.5) +
+ guides(color = FALSE) + scale_color_manual(values = c("black", "red")) +
+ xlab("mean expression") + ylab(expression("<U+0392>")) + scale_x_log10()
+ }
> sleuth_MA(bort_tab)
> sleuth_MA(e7107_tab)
> sleuth_MA(be_tab)
> orgdb = "org.Hs.eg.db"
> biomart_dataset = "hsapiens_gene_ensembl"
> keggname = "hsa"
> library(dplyr)
> library(clusterProfiler)
> library(orgdb, character.only = TRUE)
> library(biomaRt)
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
> summarize_cp = function(res, comparison) {
+ summaries = data.frame()
+ for (ont in names(res)) {
+ ontsum = summary(res[[ont]])
+ ontsum$ont = ont
+ summaries = rbind(summaries, ontsum)
+ }
+ summaries$comparison = comparison
+ summaries = summaries %>% arrange(qvalue)
+ return(summaries)
+ }
>
> enrich_cp = function(res, comparison) {
+ res = data.frame(res)
+ res = res %>% tibble::rownames_to_column() %>% left_join(entrez, by = c(rowname = "ensembl_gene_id")) %>%
+ filter(!is.na(entrezgene))
+ universe = res$entrezgene
+ genes = subset(res, padj < 0.05)$entrezgene
+ mf = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "MF", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ cc = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "CC", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ bp = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "BP", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ kg = enrichKEGG(gene = genes, universe = universe, organism = "hsa", pvalueCutoff = 1,
+ qvalueCutoff = 1, pAdjustMethod = "BH")
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
> gsea_cp = function(res, comparison) {
+ res = res %>% data.frame() %>% tribble::rownames_to_column() %>% left_join(entrez,
+ by = c(rowname = "ensembl_gene_id")) %>% filter(!is.na(entrezgene)) %>%
+ filter(!is.na(log2FoldChange)) %>% filter(!is.na(lfcSE))
+ lfc = data.frame(res)[, "log2FoldChange"]
+ lfcse = data.frame(res)[, "lfcSE"]
+ genes = lfc/lfcse
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ cc = gseGO(genes, ont = "CC", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ mf = gseGO(genes, ont = "MF", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ bp = gseGO(genes, ont = "bp", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ genes = data.frame(res)[, "log2FoldChange"]
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ genes = genes[!is.na(genes)]
+ kg = gseKEGG(geneList = genes, organism = "mmu", nPerm = 500, pvalueCutoff = 1,
+ verbose = TRUE)
+ if (orgdb == "org.Hs.eg.db") {
+ do = summary(gseDO(geneList = genes, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE))
+ do$ont = "DO"
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg, do = do)
+ } else {
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ }
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
> write_enrich = function(res, filename) {
+ res = subset(res, qvalue < 0.05)
+ write.table(res, file = filename, quote = FALSE, sep = ",", row.names = FALSE,
+ col.names = TRUE)
+ }
Here we take the set of genes that are flagged as significantly different for each comparison and look for pathways that tend to be enriched for these genes.
> bort_enrich = enrich_cp(bort, "Bort")
> write_enrich(bort_enrich$summary, "control-bort-pathway.csv")
> e7107_enrich = enrich_cp(e7107, "e7107")
> write_enrich(e7107_enrich$summary, "control-e7107-pathway.csv")
> be_enrich = enrich_cp(be, "BE")
> write_enrich(be_enrich$summary, "control-be-pathway.csv")
> tpm = so$obs_norm %>% dplyr::select(target_id, sample, tpm) %>% tidyr::spread(sample,
+ tpm)
> library(biomaRt)
> library(dplyr)
> human = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl",
+ host = "jul2015.archive.ensembl.org")
> symbols = getBM(attributes = c("ensembl_transcript_id", "external_gene_name",
+ "gene_biotype", "transcript_biotype"), mart = human)
> tpm = tpm %>% left_join(symbols, by = c(target_id = "ensembl_transcript_id"))
> write.table(tpm, file = "tpm.csv", quote = FALSE, col.names = TRUE, row.names = FALSE,
+ sep = ",")
Description of these tables:
GeneRatio = (# genes in this set DE) / (total genes DE)
BgRatio = (# genes expressed in this set) / (total genes expressed)
geneID = Entrez IDs of genes that are DE in this set
ont = ontological term tested (MF=molecular function, CC=cellular compartment,
BP=biological process, KG=KEGG pathway)
When we did the pairwise comparisons, it made it hard to answer questions about what the effect of treating with both Bortezomib and E7107 is, without resorting to a crappy solution of doing Venn diagrams of the DE genes.
To get at that question, we’ll look at the data slightly differently, instead of comparing the treated back to the control samples, we instead are going to look for effects that are specific to the treatment with Bortezomib or E7107 alone and also look for genes that are specifically turned on when both are treated. To do that we will re-analyze the data, but fit a model with an interaction term, and then test for the effects of treatment with Bort alone, treatment with E7107 alone and treamtent with both Bort and E7107.
> summarydata$bort = ifelse(summarydata$treatment == "Bort", "yes", ifelse(summarydata$treatment ==
+ "BE", "yes", "no"))
> summarydata$E7107 = ifelse(summarydata$treatment == "E7107", "yes", ifelse(summarydata$treatment ==
+ "BE", "yes", "no"))
> design = ~bort + E7107 + bort:E7107
> if (exists("txi.salmon")) {
+ txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+ dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+ dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row !=
+ 0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
> symbols = unique(t2g[, c("ens_gene", "ext_gene", "gene_biotype")])
> borteff = results(dds, name = "bort_yes_vs_no")
> plotMA_full(borteff)
> volcano(borteff)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1742]
> write_deseq(borteff, "bort-effect.csv")
> e7107eff = results(dds, name = "E7107_yes_vs_no")
> plotMA_full(e7107eff)
> volcano(e7107eff)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1910]
> write_deseq(e7107eff, "E7107-effect.csv")
> combinedeff = results(dds, name = "bortyes.E7107yes")
> plotMA_full(combinedeff)
> volcano(combinedeff)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.2078]
> write_deseq(combinedeff, "combined-effect.csv")
These samples are all very very similar to each other within a sample class, you can see they are almost perfectly correlated using Pearson correlation:
> library(corrplot)
> m = cor(counts)
> corrplot(m, method = "number")
I talked with Praveen and it sounded like these were closer to being technical variants than biological variants in terms of the experimental setup, and the extremely high degree of correlation supports that. If that is true, it might be worthwhile to just look at one sample of each and pretend like the experiment was run with no replicates. We’re in kind of a tough spot because treating technical replicates as biological replicates is wrong, and leads to what we are seeing where there are very high confidence changes when there is only a small difference. If we do it the other way with no replicates, then we lose a lot of power. However if these are actually technical replicates than the ‘power’ we have is illusory since it isn’t based on anything real.
Here we select genes which are significantly different from the control in the combinination treatment and look how they correlate with the individual treatments:
> sig = rownames(subset(be, padj < 0.05 & !is.na(padj) & abs(log2FoldChange) >
+ 1 & baseMean > 50))
> bortrev = (bort[sig, ]$log2FoldChange * be[sig, ]$log2FoldChange) < 0
> names(bortrev) = sig
> e7107rev = (e7107[sig, ]$log2FoldChange * be[sig, ]$log2FoldChange) < 0
> names(e7107rev) = sig
> table(bortrev & !e7107rev)
FALSE TRUE
4067 4411
> table(e7107rev & !bortrev)
FALSE TRUE
8405 73
> table(bortrev & e7107rev)
FALSE TRUE
8460 18
> both = sig[bortrev & e7107rev]
> write_deseq(combinedeff[both, ], "combined-treatment-reversal.csv")
> foldchanges = data.frame(bort = bort[sig, ]$log2FoldChange, e7107 = e7107[sig,
+ ]$log2FoldChange, both = be[sig, ]$log2FoldChange)
> ggplot(foldchanges, aes(bort, both)) + geom_point() + ylab("logFC treatment with bortezomib and E7107") +
+ xlab("logFC treatment with bortezomib")
> ggplot(foldchanges, aes(e7107, both)) + geom_point() + ylab("logFC treatment with bortezomib and E7107") +
+ xlab("logFC treatment with E7107")
There are 4411 genes affected by the combination treatment that have a combined affect in the opposite direction of the bortezomib alone effect. There are 73 genes affected by the combination treatment that have a combined effect in the opposite direction of the E7107 alone effect. And there are 18 genes affected by the combination treatment that have a combined effect in the opposite direction than both the E7107 and bortezomib alone treatments.
Below we select the starting set of genes differently, by looking at the genes with a significant interaction term. These are genes that appear to have a non-additive effect of combining the two treatments:
> sig = rownames(subset(combinedeff, padj < 0.05 & !is.na(padj) & abs(log2FoldChange) >
+ 1 & baseMean > 50))
> bortrev = (bort[sig, ]$log2FoldChange * be[sig, ]$log2FoldChange) < 0
> names(bortrev) = sig
> e7107rev = (e7107[sig, ]$log2FoldChange * be[sig, ]$log2FoldChange) < 0
> names(e7107rev) = sig
> table(bortrev & !e7107rev)
FALSE TRUE
1776 2129
> table(e7107rev & !bortrev)
FALSE TRUE
3766 139
> table(bortrev & e7107rev)
FALSE TRUE
3611 294
> both = sig[bortrev & e7107rev]
> write_deseq(combinedeff[both, ], "combined-treatment-reversal.csv")
> foldchanges = data.frame(bort = bort[sig, ]$log2FoldChange, e7107 = e7107[sig,
+ ]$log2FoldChange, both = be[sig, ]$log2FoldChange)
> ggplot(foldchanges, aes(bort, both)) + geom_point() + ylab("logFC treatment with bortezomib and E7107") +
+ xlab("logFC treatment with bortezomib")
> ggplot(foldchanges, aes(e7107, both)) + geom_point() + ylab("logFC treatment with bortezomib and E7107") +
+ xlab("logFC treatment with E7107")
There are 2129 genes with a non-additive combination treatment effect that have a combined affect in the opposite direction of the bortezomib alone effect. There are 139 genes with a non-additive combination treatment effect that have a combined effect in the opposite direction of the E7107 alone effect. And there are 294 genes with a non-additive treatment effect that have a combined effect in the opposite direction than both the E7107 and bortezomib alone treatments.
Here we looked at all genes that were affected in a non-additive way by the combination of the two drugs. Then we looked at genes that were moving in the same direction in each of the single treatments and the combination treatment, but were more strongly affected in the combination treatment than expected by adding the two treatments together (amplified). Similarly we looked for genes that were less strongly affected than expected by adding the two treatments together (dampened).
> amplified = (e7107[sig, ]$log2FoldChange > 0 & be[sig, ]$log2FoldChange > 0 &
+ bort[sig, ]$log2FoldChange > 0 & combinedeff[sig, ]$log2FoldChange > 0) |
+ (e7107[sig, ]$log2FoldChange < 0 & be[sig, ]$log2FoldChange < 0 & bort[sig,
+ ]$log2FoldChange < 0 & combinedeff[sig, ]$log2FoldChange < 0)
> dampened = (e7107[sig, ]$log2FoldChange > 0 & be[sig, ]$log2FoldChange > 0 &
+ bort[sig, ]$log2FoldChange > 0 & combinedeff[sig, ]$log2FoldChange < 0) |
+ (e7107[sig, ]$log2FoldChange < 0 & be[sig, ]$log2FoldChange < 0 & bort[sig,
+ ]$log2FoldChange < 0 & combinedeff[sig, ]$log2FoldChange > 0)
> fcplot = data.frame(additive = e7107[sig, ]$log2FoldChange + bort[sig, ]$log2FoldChange,
+ both = be[sig, ]$log2FoldChange, amplified = ifelse(amplified, "amplified",
+ ifelse(dampened, "dampened", "neither")))
> ggplot(fcplot, aes(additive, both, color = amplified)) + geom_point() + xlab("theoretical additive effect of both drugs") +
+ ylab("actual effect when both drugs are applied") + labs(color = "")